Setup

library(tidyverse)
library(magrittr)
library(ngsReports)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(AnnotationHub)
library(ensembldb)
library(edgeR)
library(corrplot)
library(DT)
library(pander)
library(ggrepel)
library(pheatmap)
library(ggdendro)
library(Biostrings)
library(Gviz)
library(Rsamtools)
library(viridis)
library(ggfortify)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- 6

Sequence information

ah <- AnnotationHub() %>%
  subset(species == "Homo sapiens") %>%
  subset(rdataclass == "EnsDb")
ensDb <- ah[["AH83216"]]
trEns <- transcripts(ensDb) %>%
  mcols() %>% 
  as_tibble()
trLen <- exonsBy(ensDb, "tx") %>%
  width() %>%
  vapply(sum, integer(1))
geneGcLen <- trLen %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns) %>%
  group_by(gene_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
trGcLen <- trLen %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns) %>%
  group_by(tx_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
genesGR <- genes(ensDb)
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name", 
                                         "gene_biotype", "entrezid")]
txGR <- transcripts(ensDb)
mcols(txGR) <- mcols(txGR)[c("tx_id", "tx_name", 
                                   "tx_biotype", "tx_id_version", "gene_id")]

An EnsDb object was obtained for Ensembl release 101 using the AnnotationHub package. This provided the GC content and length for every gene and transcript in the release. For humans, this consists of 67990 genes and 252335 transcripts.

Raw data

Sample information

files <- list.files(
  path = "/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/0_rawData/FastQC",
  pattern = "zip",
  full.names = TRUE
)
samples <- tibble(
  sample = str_remove(basename(files), "_fastqc.zip"),
  dataset = NA,
  organism = NA
) %>%
  mutate(
    dataset = "StrippedSerum",
    organism = "human"
  )
datasets <- samples$dataset %>% 
  unique()

The following analysis involves 16 paired-end samples across 1 dataset(s): StrippedSerum.

Library sizes

rawFqc <- files %>%
  FastqcDataList()
data <- grep("GLL", fqName(rawFqc))
labels <- rawFqc[data] %>%
  fqName() %>%
  str_remove("_001\\.fastq\\.gz") %>%
  str_remove("Ps2Ex3M1_")
rawLib <- plotReadTotals(rawFqc[data]) +
  labs(subtitle = "StrippedSerum") + 
  scale_x_discrete(labels = labels)

The library sizes of the unprocessed dataset(s) range between 22,057,776 and 42,743,934 reads.

rawLib

GC content

rRNA transcripts are known to have high GC content. Therefore, inspecting the GC content of the raw reads is a logical start point for detecting incomplete rRNA removal. A spike in GC content at ~ 70% is expected if this is the case.

plotly::ggplotly(
  plotGcContent(
    x = rawFqc[data], 
    plotType = "line",
    gcType = "Transcriptome",
    species = "Hsapiens"
  ) +
    labs(title = "StrippedSerum Dataset (H. sapiens)") + 
    theme(legend.position="none")
) 

GC content of reads in the dataset. Clear spikes at about 70% GC are observed, which is likely due to incomplete rRNA depletion.

Overrepresented sequences

getModule(rawFqc, "Overrep") %>% 
  group_by(Sequence, Possible_Source) %>% 
  summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>% 
  arrange(desc(`Highest Percentage`), desc(`Found In`)) %>% 
  ungroup() %>% 
  dplyr::slice(1:30) %>%
  mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
  kable(
    align = "llrr", 
    caption = paste(
      "Top", nrow(.),"Overrepresented sequences.",
      "The number of samples they were found in is shown,",
      "along with the percentage of the most 'contaminated' sample."
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Top 30 Overrepresented sequences. The number of samples they were found in is shown, along with the percentage of the most ‘contaminated’ sample.
Sequence Possible_Source Found In Highest Percentage
CCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATTGATG No Hit 16 0.84%
CTGGAGTCTTGGAAGCTTGACTACCCTACGTTCTCCTACAAATGGACCTT No Hit 16 0.60%
CTCCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATTGA No Hit 16 0.59%
CCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTAC No Hit 8 0.55%
CCCCTCCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATAT No Hit 16 0.50%
CCAGGCTGGAGTGCAGTGGCTATTCACAGGCGCGATCCCACTACTGATCA No Hit 13 0.50%
CCCTCCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATT No Hit 16 0.49%
CTTGGTTATAATTTTTCATCTTTCCCTTGCGGTACTATATCTATTGCGCC No Hit 16 0.49%
CTCCGTTTCCGACCTGGGCCGGTTCACCCCTCCTTAGGCAACCTGGTGGT No Hit 16 0.47%
CTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATTGATGC No Hit 15 0.45%
CCCTGTTCTTGGGTGGGTGTGGGTATAATGCTAAGTTGAGATGATATCAT No Hit 8 0.41%
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA No Hit 1 0.40%
CCCAAACCCACTCCACCCTACTACCAGACAACCTTAGCCAAACCATTTAC No Hit 8 0.39%
CCCTGTTCTTGGGTGGGTGTGGGTATAATACTAAGTTGAGATGATATCAT No Hit 8 0.39%
GTATAATACTAAGTTGAGATGATATCATTTACGGGGGAAGGCGCTTTGTG No Hit 8 0.39%
CGGGGGAAGGCGCTTTGTGAAGTAGGCCTTATTTCTCTTGTCCTTTCGTA No Hit 16 0.36%
GTGGGTATAATGCTAAGTTGAGATGATATCATTTACGGGGGAAGGCGCTT No Hit 8 0.33%
CTCTCTACAAGGTTTTTTCCTAGTGTCCAAAGAGCTGTTCCTCTTTGGAC No Hit 16 0.33%
CTTATTTCTCTTGTCCTTTCGTACAGGGAGGAATTTGAAGTAGATAGAAA No Hit 16 0.32%
GGGTATAATACTAAGTTGAGATGATATCATTTACGGGGGAAGGCGCTTTG No Hit 8 0.31%
CTCAGACCGCGTTCTCTCCCTCTCACTCCCCAATACGGAGAGAAGAACGA No Hit 12 0.31%
GTAAGATTTGCCGAGTTCCTTTTACTTTTTTTAACCTTTCCTTATGGGCA No Hit 8 0.31%
CTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACT No Hit 16 0.29%
GTCCAATTGGGTGTGAGGAGTTCAGTTATATGTTTGGGATTTTTTAGGTA No Hit 12 0.28%
CCTCCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATTG No Hit 16 0.28%
CTGGTTTCGGGGGTCTTAGCTTTGGCTCTCCTTGCAAAGTTATTTCTAGT No Hit 15 0.28%
CTCTAGAATAGGATTGCGCTGTTATCCCTAGGGTAACTTGTTCCGTTGGT No Hit 16 0.26%
CTGTTCTTGGGTGGGTGTGGGTATAATACTAAGTTGAGATGATATCATTT No Hit 5 0.25%
CTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGG No Hit 16 0.25%
CTCAGGCTGGAGTGCAGTGGCTATTCACAGGCGCGATCCCACTACTGATC No Hit 13 0.25%

Trimmed data

Raw libraries were trimmed using cutadapt v1.14 to remove Illumina adapter sequences. Bases with PHRED score < 30, NextSeq-induced polyG runs and reads shorter than 35bp were also removed.

trimFqc <- list.files(
  path = "/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/1_trimmedData/FastQC",
  pattern = "zip",
  full.names = TRUE
) %>%
  FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
  dplyr::rename(Raw = Total_Sequences) %>%
  left_join(readTotals(trimFqc), by = "Filename") %>%
  dplyr::rename(Trimmed = Total_Sequences) %>%
  mutate(
    Discarded = 1 - Trimmed/Raw,
    Retained = Trimmed / Raw
  )

After trimming of adapters between 6.09% and 8.36% of reads were discarded.

Aligned data

Trimmed reads were:

  1. Aligned to rRNA sequences using the BWA-MEM algorithm to estimate the proportion of reads that were of rRNA origin within each sample. BWA-MEM is recommended for high-quality queries of reads ranging from 70bp to 1Mbp as it is faster and more accurate that alternative algorithms BWA-backtrack and BWA-SW.

  2. Aligned to the Homo sapiens GRCh38 genome (Ensembl release 101) using STAR v2.7.0d and summarised with featureCounts from the Subread v1.5.2 package. These counts were used for all gene-level analysis.

rRNA proportions

rRnaProp <- read.delim(
  "/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/3_bwa/log/samples.mapped.all", 
  sep = ":", 
  col.names = c("sample", "proportion"), 
  header = FALSE
) %>% 
  mutate(
    sample = str_remove_all(sample, ".mapped"),
    sample = basename(sample),
    proportion = proportion/100,
    dataset = "StrippedSerum",
    organism = "human"
  ) %>%
  as_tibble()
rRnaProp$dataset %<>%
  factor(levels = c("StrippedSerum"))
rRnaProp %>%
  ggplot(aes(sample, proportion)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~dataset, scales = "free_x") +
  scale_y_continuous(labels = percent) +
  labs(x = "Sample", y = "Percent of Total", fill = "Read pair") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
*Percentages of each library that align to rRNA sequences with `bwa mem`.*

Percentages of each library that align to rRNA sequences with bwa mem.

Gene GC content and length

dgeList <- read_tsv("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/4_star2pass/featureCounts/genes.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
metaData <- read_tsv("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/T47D_ZR75_DHT_StrippedSerum.tsv")
dgeList$genes <- genesGR[rownames(dgeList),]
mcols(dgeList$genes) %<>% 
  as.data.frame() %>% 
  left_join(geneGcLen)
addInfo <- tibble(
  sample = rRnaProp$sample,
  dataset = "StrippedSerum",
  organism = "human",
  rRNA = rRnaProp$proportion
) %>%
  left_join(metaData)
dgeList$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
dgeList$samples$filenames <- list.files(
  "/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/2_alignedData/bam", 
  pattern = ".bam$", 
  full.names = TRUE
)
# dgeList$samples$group <- colnames(dgeList) %>%
#   str_extract("(D|V)") %>%
#   factor(levels = c("D", "V"))
gcInfo <- function(x) {
  x$counts %>%
    as.data.frame() %>%
    rownames_to_column("gene_id") %>%
    as_tibble() %>%
    pivot_longer(
      cols = colnames(x), 
      names_to = "sample", 
      values_to = "counts"
    ) %>%
    dplyr::filter(
      counts > 0
    ) %>%
    left_join(
      geneGcLen
    ) %>%
    dplyr::select(
      ends_with("id"), sample, counts, aveGc, maxLen
    ) %>%
    split(f = .$sample) %>%
    lapply(
      function(x){
        DataFrame(
          gc = Rle(x$aveGc, x$counts),
          logLen = Rle(log10(x$maxLen), x$counts)
        )
      }
    ) 
}
gcSummary <- function(x) {
  x %>%
    vapply(function(x){
      c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
    }, numeric(4)
    ) %>%
    t() %>%
    set_colnames(
      c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
    ) %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble()
}
rle <- gcInfo(dgeList)
sumGc <- gcSummary(rle)
a <- sumGc %>%
  left_join(dgeList$samples) %>%
  ggplot(aes(rRNA, mn_logLen)) +
  geom_point(aes(colour = treat, shape = cell_line), size = 3) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean log(Length)",
    colour = "Treatment",
    shape = "Cell line"
  ) 
b <- sumGc %>%
  left_join(dgeList$samples) %>%
  ggplot(aes(rRNA, mn_gc)) +
  geom_point(aes(colour = treat, shape = cell_line), size = 3) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean GC Content",
    colour = "Treatment",
    shape = "Cell line"
  )
ggarrange(
  a, b, ncol = 2, nrow = 1, 
  common.legend = TRUE, legend = "bottom"
) %>%
  annotate_figure("StrippedSerum Dataset (H. sapiens)")
*Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*

Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.

PCA

genes2keep <- dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(6)
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()
pca <- cpm(dgeFilt, log = TRUE) %>%
  t() %>%
  prcomp()
pcaCor <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sumGc) %>%
  as_tibble() %>% 
  left_join(dgeList$samples) %>%
  dplyr::select(
    PC1, PC2, PC3, 
    Mean_GC = mn_gc, 
    Mean_Length = mn_logLen, 
    rRna_Proportion = rRNA
  ) %>% 
  cor()
a <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(dgeList$samples) %>%
  as_tibble() %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(colour = treat, shape = cell_line), size = 2) +
  labs(
    x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
    y = paste0("PC2 (", percent(summary(pca)$importance["Proportion of Variance","PC2"]),")"),
    colour = "Treatment",
    shape = "Cell line"
  )
b <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(dgeList$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = treat, shape = cell_line), size = 2) +
  geom_smooth(method = "lm") +
  geom_text_repel(show.legend = FALSE) +
  scale_y_continuous(labels = percent) +
  labs(
    x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
    y = "rRNA Proportion of Initial Library",
    colour = "Treatment",
    shape = "Cell line"
  )
c <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sumGc) %>%
  left_join(dgeList$samples) %>%
  as_tibble() %>%
  ggplot(aes(PC1, mn_logLen)) +
  geom_point(aes(colour = treat, shape = cell_line), size = 2) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
    y = "Mean log(Length)",
    colour = "Treatment",
    shape = "Cell line"
  )
d <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sumGc) %>%
  left_join(dgeList$samples) %>%
  as_tibble() %>%
  ggplot(aes(PC1, mn_gc)) +
  geom_point(aes(colour = treat, shape = cell_line), size = 2) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  labs(
    x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
    y = "Mean GC",
    colour = "Treatment",
    shape = "Cell line"
  )
ggarrange(
  a, b, c, d, ncol = 2, nrow = 2,
  common.legend = TRUE, legend = "bottom"
) %>%
  annotate_figure("StrippedSerum")
*PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.*

PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.

corrplot(
  pcaCor,
  type = "lower", 
  diag = FALSE, 
  addCoef.col = 1, addCoefasPercent = TRUE
)
*Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.*

Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.

Differential expression

design <- model.matrix(~rRNA, data = dgeFilt$samples)
voom <- voom(dgeFilt, design = design)
fit <- lmFit(voom, design = design)
eBayes <- eBayes(fit)
topTable <- eBayes %>%
  topTable(coef = colnames(design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  mutate(Bonf = p.adjust(P.Value, "bonferroni")) %>%
  mutate(DE = Bonf < 0.05) %>%
  unite(Location, c(seqnames, start, end, width, strand), sep = ":") %>%
  dplyr::select(
    Geneid = gene_id,
    Symbol = gene_name,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    Location,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
topTable %>% 
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR, DE) %>%
  mutate(
    AveExpr = format(round(AveExpr, 2), nsmall = 2),
    logFC = format(round(logFC, 2), nsmall = 2),
    P.Value = sprintf("%.2e", P.Value),
    FDR = sprintf("%.2e", FDR)
  ) %>%
  dplyr::slice(1:200) %>%
  datatable(
    options = list(pageLength = 20), 
    class = "striped hover condensed responsive", 
    filter = "top",
    caption = paste(
      "The top 100 differentially expressed genes.",
      nrow(dplyr::filter(topTable, DE)),
      "of",
      nrow(topTable),
      "genes were classified as DE with an Bonferroni p-value < 0.05."
    )
  )

k-mer analysis

jellyfish v2.3.0 was used to count kmers of trimmed fastq files that had been filtered for rRNA sequences. This was performed for 5 values: \(k = 5, 6, 7, 8, 9, 10\). Lower values of \(k\) lose specificity in comparison to higher values, however as \(k\) increases, the exponential increase of possible kmers causes limitations due to computational processing time.

Setup

k = 5

Counts

k5files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/6_jellyfish2pass/k5", pattern = "_dumps.txt", full.names = TRUE)
k5counts <- lapply(k5files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("D", "V")))
k5dge <- k5counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k5dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")

Properties

k5dist <- k5dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 5-mers")
k5labels <- k5dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k5heat <- k5dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "5-mer counts heatmap", labels_row = k5labels)
k5heat$tree_row$labels <- k5labels
k5den <- ggdendrogram(k5heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 5-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k5pca <- k5dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k5pca)$importance %>% pander(split.tables = Inf)
k5pcaPlot <- k5pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k5dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = treat, shape = cell_line, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k5pca)$importance[2, "PC2"]), ")"),
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 5"
  )
k5pcaRrna <- k5pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k5dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = treat, shape = cell_line), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 5"
  ) +
  scale_y_continuous(labels = percent)

Differential expression

k5design <- model.matrix(~rRNA, data = k5dge$samples)
k5voom <- voom(k5dge, design = k5design)
k5fit <- lmFit(k5voom, design = k5design)
k5eBayes <- eBayes(k5fit)
k5topTable <- k5eBayes %>%
  topTable(coef = colnames(k5design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 6

Counts

k6files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/6_jellyfish2pass/k6", pattern = "_dumps.txt", full.names = TRUE)
k6counts <- lapply(k6files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("D", "V")))
k6dge <- k6counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k6dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k6dge$samples$group <- colnames(k6dge) %>%
  str_extract("(D|V)") %>%
  factor(levels = c("D", "V"))

Properties

k6dist <- k6dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 6-mers")
k6labels <- k6dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k6heat <- k6dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "6-mer counts heatmap", labels_row = k6labels)
k6heat$tree_row$labels <- k6labels
k6den <- ggdendrogram(k6heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 6-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k6pca <- k6dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k6pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k6pcaPlot <- k6pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k6dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = treat, shape = cell_line, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k6pca)$importance[2, "PC2"]), ")"),
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 6"
  )
k6pcaRrna <- k6pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k6dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = treat, shape = cell_line), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 6"
  ) +
  scale_y_continuous(labels = percent)

Differential expression

k6design <- model.matrix(~rRNA, data = k6dge$samples)
k6voom <- voom(k6dge, design = k6design)
k6fit <- lmFit(k6voom, design = k6design)
k6eBayes <- eBayes(k6fit)
k6topTable <- k6eBayes %>%
  topTable(coef = colnames(k6design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 7

Counts

k7files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/6_jellyfish2pass/k7", pattern = "_dumps.txt", full.names = TRUE)
k7counts <- lapply(k7files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("D", "V")))
k7dge <- k7counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k7dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k7dge$samples$group <- colnames(k7dge) %>%
  str_extract("(D|V)") %>%
  factor(levels = c("D", "V"))

Properties

k7dist <- k7dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 7-mers")
k7labels <- k7dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k7heat <- k7dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "7-mer counts heatmap", labels_row = k7labels)
k7heat$tree_row$labels <- k7labels
k7den <- ggdendrogram(k7heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 7-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k7pca <- k7dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k7pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k7pcaPlot <- k7pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k7dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = treat, shape = cell_line, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k7pca)$importance[2, "PC2"]), ")"),
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 7"
  )
k7pcaRrna <- k7pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k7dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = treat, shape = cell_line), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Treatment",
    title = "k = 7"
  ) +
  scale_y_continuous(labels = percent)

Differential expression

k7design <- model.matrix(~rRNA, data = k7dge$samples)
k7voom <- voom(k7dge, design = k7design)
k7fit <- lmFit(k7voom, design = k7design)
k7eBayes <- eBayes(k7fit)
k7topTable <- k7eBayes %>%
  topTable(coef = colnames(k7design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 8

Counts

k8files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/6_jellyfish2pass/k8", pattern = "_dumps.txt", full.names = TRUE)
k8counts <- lapply(k8files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("D", "V")))
k8dge <- k8counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k8dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k8dge$samples$group <- colnames(k8dge) %>%
  str_extract("(D|V)") %>%
  factor(levels = c("D", "V"))

Properties

k8dist <- k8dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 8-mers")
k8labels <- k8dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k8heat <- k8dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "8-mer counts heatmap", labels_row = k8labels)
k8heat$tree_row$labels <- k8labels
k8den <- ggdendrogram(k8heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 8-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k8pca <- k8dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k8pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k8pcaPlot <- k8pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k8dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = treat, shape = cell_line, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k8pca)$importance[2, "PC2"]), ")"),
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 8"
  )
k8pcaRrna <- k8pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k8dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = treat, shape = cell_line), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 8"
  ) +
  scale_y_continuous(labels = percent)

Differential expression

k8design <- model.matrix(~rRNA, data = k8dge$samples)
k8voom <- voom(k8dge, design = k8design)
k8fit <- lmFit(k8voom, design = k8design)
k8eBayes <- eBayes(k8fit)
k8topTable <- k8eBayes %>%
  topTable(coef = colnames(k8design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 9

Counts

k9files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/6_jellyfish2pass/k9", pattern = "_dumps.txt", full.names = TRUE)
k9counts <- lapply(k9files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("D", "V")))
k9dge <- k9counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k9dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k9dge$samples$group <- colnames(k9dge) %>%
  str_extract("(D|V)") %>%
  factor(levels = c("D", "V"))

Properties

k9dist <- k9dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 9-mers")
k9labels <- k9dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k9heat <- k9dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "9-mer counts heatmap", labels_row = k9labels)
k9heat$tree_row$labels <- k9labels
k9den <- ggdendrogram(k9heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 9-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k9pca <- k9dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k9pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k9pcaPlot <- k9pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k9dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = treat, shape = cell_line, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k9pca)$importance[2, "PC2"]), ")"),
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 9"
  )
k9pcaRrna <- k9pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k9dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = treat, shape = cell_line), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 9"
  ) +
  scale_y_continuous(labels = percent)

Differential expression

k9design <- model.matrix(~rRNA, data = k9dge$samples)
k9voom <- voom(k9dge, design = k9design)
k9fit <- lmFit(k9voom, design = k9design)
k9eBayes <- eBayes(k9fit)
k9topTable <- k9eBayes %>%
  topTable(coef = colnames(k9design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 10

Counts

k10files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/6_jellyfish2pass/k10", pattern = "_dumps.txt", full.names = TRUE)
k10counts <- lapply(k10files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("D", "V")))
k10counts[is.na(k10counts)] <- 0
k10dge <- k10counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k10dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k10dge$samples$group <- colnames(k10dge) %>%
  str_extract("(D|V)") %>%
  factor(levels = c("D", "V"))

Properties

k10dist <- k10dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 10-mers")
k10labels <- k10dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k10heat <- k10dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "10-mer counts heatmap", labels_row = k10labels)
k10heat$tree_row$labels <- k10labels
k10den <- ggdendrogram(k10heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 10-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k10pca <- k10dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k10pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k10pcaPlot <- k10pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k10dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = treat, shape = cell_line, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k10pca)$importance[2, "PC2"]), ")"),
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 10"
  )
k10pcaRrna <- k10pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k10dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = treat, shape = cell_line), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Treatment",
    shape = "Cell line",
    title = "k = 10"
  ) +
  scale_y_continuous(labels = percent)

Differential expression

k10design <- model.matrix(~rRNA, data = k10dge$samples)
k10voom <- voom(k10dge, design = k10design)
k10fit <- lmFit(k10voom, design = k10design)
k10eBayes <- eBayes(k10fit)
k10topTable <- k10eBayes %>%
  topTable(coef = colnames(k10design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

Distributions

ggarrange(
  k5dist, k6dist, k7dist, k8dist, k9dist, k10dist,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom" 
)
*Distributions of kmer counts for each value of $k$*

Distributions of kmer counts for each value of \(k\)

PCA

ggarrange(
  k5pcaPlot, k6pcaPlot, k7pcaPlot, k8pcaPlot, k9pcaPlot, k10pcaPlot,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
*Principal component analysis for all values of $k$.*

Principal component analysis for all values of \(k\).

ggarrange(
  k5pcaRrna, k6pcaRrna, k7pcaRrna, k8pcaRrna, k9pcaRrna, k10pcaRrna,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
*Principal component analysis for all values of $k$.*

Principal component analysis for all values of \(k\).

Clustering

ggarrange(
  k5den, k6den, k7den, k8den, k9den, k10den,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
*Hierarchical clustering of samples based on kmer counts.*

Hierarchical clustering of samples based on kmer counts.

Differential expression

topRes <- function(x, cap){
  x %>% 
    dplyr::select(mer, AveExpr, logFC, P.Value, FDR, BY, DE) %>%
    mutate(
      AveExpr = format(round(AveExpr, 2), nsmall = 2),
      logFC = format(round(logFC, 2), nsmall = 2),
      P.Value = sprintf("%.2e", P.Value),
      FDR = sprintf("%.2e", FDR),
      BY = sprintf("%.2e", BY)
    ) %>%
    dplyr::slice(1:100) %>%
    datatable(
      options = list(pageLength = 10),
      class = "striped hover condensed responsive",
      filter = "top",
      caption = cap
    )
}

k = 5

topRes(k5topTable,
       cap = paste(
         "The top 100 differentially expressed 5-mers.",
         nrow(dplyr::filter(k5topTable, DE)),
         "of",
         nrow(k5topTable),
         "detected sequences were classified as DE with BY p-value < 0.05."
       )
)
k5topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 5")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k5topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  # geom_text_repel(
  #   data = . %>%
  #     # dplyr::filter(-log10(P.Value) > 4 | logFC > 4 | logFC < -2),
  #     dplyr::filter(-log10(P.Value) > 4 | logFC > 3 | logFC < -2.5),
  #   aes(label = mer, color = "black")
  # ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 5") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 6

topRes(k6topTable,
       cap = paste(
         "The top 100 differentially expressed 6-mers.",
         nrow(dplyr::filter(k6topTable, DE)),
         "of",
         nrow(k6topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k6topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 6")
*Histogram of p-values. Values follow the expected distribution when there are some differences.*

Histogram of p-values. Values follow the expected distribution when there are some differences.

k6topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  # geom_text_repel(
  #   data = . %>%
  #     dplyr::filter(-log10(P.Value) > 6 | logFC > 6 | logFC < -2.3),
  #   aes(label = mer, color = "black")
  # ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 6") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 7

topRes(k7topTable,
       cap = paste(
         "The top 100 differentially expressed 7-mers.",
         nrow(dplyr::filter(k7topTable, DE)),
         "of",
         nrow(k7topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k7topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 7")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k7topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  # geom_text_repel(
  #   data = . %>%
  #     dplyr::filter(-log10(P.Value) > 6.4 | logFC > 10 | logFC < -5),
  #   aes(label = mer, color = "black")
  # ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 7") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 8

topRes(k8topTable,
       cap = paste(
         "The top 100 differentially expressed 8-mers.",
         nrow(dplyr::filter(k8topTable, DE)),
         "of",
         nrow(k8topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k8topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 8")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k8topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  # geom_text_repel(
  #   data = . %>%
  #     dplyr::filter(-log10(P.Value) > 7.2 | logFC > 12.5 | logFC < -8),
  #   aes(label = mer, color = "black")
  # ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 8") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 9

topRes(k9topTable,
       cap = paste(
         "The top 100 differentially expressed 9-mers.",
         nrow(dplyr::filter(k9topTable, DE)),
         "of",
         nrow(k9topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k9topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 9")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k9topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  # geom_text_repel(
  #   data = . %>%
  #     dplyr::filter(DE & -log10(P.Value) > 7.5 | logFC < -10 | logFC > 15),
  #   aes(label = mer, color = "black")
  # ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 9") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 10

topRes(k10topTable,
       cap = paste(
         "The top 100 differentially expressed 10-mers.",
         nrow(dplyr::filter(k10topTable, DE)),
         "of",
         nrow(k10topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k10topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 10")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k10topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  # geom_text_repel(
  #   data = . %>%
  #     dplyr::filter(-log10(P.Value) > 8.5 | logFC > 20 | logFC < -13.5),
  #   aes(label = mer, color = "black")
  # ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 10") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.